Dependencies

This document depends on the following packages:

# Load packages -----------------------------------------------------------
suppressPackageStartupMessages(library("BBmisc"))

# BBmisc::lib() loads each listed package; suppressAll() silences the
# startup chatter they produce
pkg_list <- c("stringi", "stringr", "tm", "wordcloud", "lda", "LDAvis",
              "stm", "stmBrowser", "lubridate", "qdap", "png", "grid")

suppressAll(lib(pkg_list))
rm(pkg_list)

Data Import

The Hillary Clinton Email dataset is publicly available and was downloaded from the Kaggle website. For this link to work one must be logged in to a Kaggle user account.

# Read the raw Kaggle export. Strings are kept as character (not factors)
# so the free-text columns can be processed directly.
dat <- read.csv("dataset/Emails.csv",
                sep = ",",
                encoding = "UTF-8",
                header = TRUE,            # was T; T/F are reassignable, spell out
                stringsAsFactors = FALSE) # was F

The description of the dataset variables can be found on the Kaggle main page for the Hillary Clinton Email dataset.

dim(dat)   # 7945 emails x 22 columns
## [1] 7945   22
names(dat) # inspect the available metadata / extracted-text fields
##  [1] "Id"                           "DocNumber"                   
##  [3] "MetadataSubject"              "MetadataTo"                  
##  [5] "MetadataFrom"                 "SenderPersonId"              
##  [7] "MetadataDateSent"             "MetadataDateReleased"        
##  [9] "MetadataPdfLink"              "MetadataCaseNumber"          
## [11] "MetadataDocumentClass"        "ExtractedSubject"            
## [13] "ExtractedTo"                  "ExtractedFrom"               
## [15] "ExtractedCc"                  "ExtractedDateSent"           
## [17] "ExtractedCaseNumber"          "ExtractedDocNumber"          
## [19] "ExtractedDateReleased"        "ExtractedReleaseInPartOrFull"
## [21] "ExtractedBodyText"            "RawText"

Data Cleaning

Before anything we must tidy up the data a little bit.

# Check the declared encoding of the body text (most rows are "unknown")
table(Encoding(dat$ExtractedBodyText))
## 
## unknown   UTF-8 
##    7381     564

The dataset encoding is not fully recognised which can cause some problems. I will first try to address this issue.

# Work on a copy of the body text; stri_enc_mark() guesses the actual
# encoding of each string
text <- dat$ExtractedBodyText
table(stri_enc_mark(text))
## 
## ASCII UTF-8 
##  7381   564

The stringi stri_enc_mark function is able to recognise the unknown encoding as ASCII. I will now try to standardize the encoding.

# Enforce UTF-8 (Not working)
# NOTE(review): stri_encode(str, from, to) is called with from = "ASCII",
# to = "UTF-8"; the Encoding() tables below show the declared encodings
# are unchanged by either attempt.
text <- stri_encode(text, "ASCII", "UTF-8")
table(Encoding(text))
## 
## unknown   UTF-8 
##    7381     564
# enc2native() also leaves the declared encoding as-is here
text <- enc2native(text)
table(Encoding(text))
## 
## unknown   UTF-8 
##    7381     564

The encoding conversion did not work so it will be left as is for the time being.

It will be interesting to identify which emails were sent to or from Hillary Clinton, so we now focus on organising the sender information.

# Identify emails from Hillary: her rows show MetadataFrom == "H" with
# SenderPersonId == 80 (see output below)
head(dat[, c("MetadataFrom", "SenderPersonId")]) # SenderPersonId == 80
##        MetadataFrom SenderPersonId
## 1 Sullivan, Jacob J             87
## 2                               NA
## 3   Mills, Cheryl D             32
## 4   Mills, Cheryl D             32
## 5                 H             80
## 6                 H             80
# 157 emails without sender information
sum(dat$MetadataFrom == "") 
## [1] 157
# Have a look at the emails with empty sender
# dat$ExtractedBodyText[dat$MetadataFrom == ""] # Lots of empty text as well

For some of these emails we could deduce whether they were sent from or to Hillary, but since there are so few I will simply remove them.

# Remove cases with MetadataFrom == "" (rows with no sender information)
dat <- dat[dat$MetadataFrom != "",]
# dim(dat)

There are over 1000 emails without text.

sum(dat$ExtractedBodyText == "") # emails with no body text (1067 per the output; an earlier run reported 1203)
## [1] 1067

I’ll have a look at the raw text of the emails to see if I can retrieve some email bodies that were not previously extracted.

# Look at RawText to see if some more email bodies can be extracted
dat$RawText[dat$ExtractedBodyText == ""][1]
## [1] "UNCLASSIFIED\nU.S. Department of State\nCase No. F-2015-04841\nDoc No. C05739545\nDate: 05/13/2015\nSTATE DEPT. - PRODUCED TO HOUSE SELECT BENGHAZI COMM.\nSUBJECT TO AGREEMENT ON SENSITIVE INFORMATION & REDACTIONS. NO FOIA WAIVER.\nRELEASE IN FULL\nFrom: Sullivan, Jacob J <Sullivan11@state.gov>\nSent: Wednesday, September 12, 2012 10:16 AM\nTo:\nSubject: FW: Wow\nFrom: Brose, Christian (Armed Services) (mailto:Christian_Brose@armed-servic,essenate.govi\nSent: Wednesday, September 12, 2012 10:09 AM\nTo: Sullivan, Jacob J\nSubject: Wow\nWhat a wonderful, strong and moving statement by your boss. please tell her how much Sen. McCain appreciated it. Me\ntoo\nUNCLASSIFIED\nU.S. Department of State\nCase No. F-2015-04841\nDoc No. C05739545\nDate: 05/13/2015\nSTATE DEPT. - PRODUCED TO HOUSE SELECT BENGHAZI COMM.\nSUBJECT TO AGREEMENT ON SENSITIVE INFORMATION & REDACTIONS. NO FOIA WAIVER. STATE-5CB0045247\n\f"

Many emails have text between “Subject:” and “UNCLASSIFIED”

# Extract text from RawText for rows whose ExtractedBodyText is empty
emptyBodyText_Idx <- dat$ExtractedBodyText == ""

# replace all line breaks from RawText with a space so the regexes below
# can match across the whole message
dat$RawText <- gsub("\n", " ", dat$RawText )

# Capture everything between "Subject:" and "UNCLASSIFIED".
# NOTE(review): (.*) is greedy, so this runs up to the LAST "UNCLASSIFIED"
# in the message; the gsub() calls below strip the leftovers.
text_field <- "Subject\\: (.*)UNCLASSIFIED"

# 839 extra texts can be retrieved this way
sum(str_detect(dat$RawText[emptyBodyText_Idx], text_field)) 
## [1] 839
# retrieve the text (NA where the pattern did not match)
retrived_text <- str_extract(dat$RawText[emptyBodyText_Idx], text_field)
# Clean it up: blank out the non-matches ...
NAidx <- is.na(retrived_text)
retrived_text[NAidx] <- ""
# ... drop forwarded headers (anything between two "Subject:" markers) ...
subject <- "Subject\\:(.*)Subject\\:"
retrived_text <- gsub(subject, "", retrived_text)
# ... and remove the classification banner itself
retrived_text <- gsub("UNCLASSIFIED", "", retrived_text)
head(retrived_text, 1)
## [1] " Wow What a wonderful, strong and moving statement by your boss. please tell her how much Sen. McCain appreciated it. Me too "

Insert the recovered text into the empty ExtractedBodyText variable.

# Add the retrieved text back into the main data.frame
dat$ExtractedBodyText[emptyBodyText_Idx] <- retrived_text

# Remove emails still without body text
empty_idx <- dat$ExtractedBodyText == ""
sum(empty_idx) # 228 still empty (an earlier run reported 364)
## [1] 228
dat <- dat[!empty_idx, ]
dim(dat) # 7560   22
## [1] 7560   22

Dates will be interesting covariates in our analysis, so we will now focus on cleaning their data.

# Clean up date variable: MetadataDateSent looks like
# "2010-08-16T04:00:00+00:00"; strip the time part and parse as a Date
date <- dat$MetadataDateSent
date <- gsub("T(.*)", "", date)
date <- ymd(date)

sum(is.na(date)) # parsing created 3 NAs
## [1] 3
na_idx <- which(is.na(date))
dat$MetadataDateSent[na_idx] # was empty in MetadataDateSent
## [1] "" "" ""
dat$ExtractedDateSent[na_idx] # was empty in ExtractedDateSent as well
## [1] "" "" ""

Impute the few missing dates

# Check the dates before and after the NAs to see if I can use them for
# imputation

# Day after 
dat$MetadataDateSent[na_idx+1]
## [1] "2010-08-16T04:00:00+00:00" "2010-02-22T05:00:00+00:00"
## [3] "2010-02-23T05:00:00+00:00"
# Day before
dat$MetadataDateSent[na_idx-1]
## [1] "2010-08-16T04:00:00+00:00" "2010-02-22T05:00:00+00:00"
## [3] "2010-02-23T05:00:00+00:00"
# Impute each missing date with the date of the next email
date[na_idx] <- date[na_idx+1]
sum(is.na(date))
## [1] 0

Create variables for days, months and year

# Derive calendar covariates from the cleaned dates
day_month <- day(date)
day_week <- wday(date, label = TRUE, abbr = FALSE)
month <- month(date, label = TRUE, abbr = FALSE)
year <- year(date)
# The emails are from 2009 to 2014
table(year)
## year
## 2009 2010 2011 2012 2014 
## 3378 3893   45  243    1
# Make new dataframe to store processed data.
# NOTE(review): 'emails' and 'text' start as identical copies of
# ExtractedBodyText; 'text' is cleaned later while 'emails' keeps the
# unprocessed body.
emails <- data.frame(to = dat$ExtractedTo,
                     from = dat$ExtractedFrom,
                     senderId = dat$SenderPersonId,
                     released = dat$ExtractedReleaseInPartOrFull,
                     emails = dat$ExtractedBodyText,
                     text = dat$ExtractedBodyText,
                     rawtext = dat$RawText,
                     date = date,
                     day_month = day_month,
                     day_week = day_week,
                     month = month,
                     year = year)

# Clean up environment, keeping only the emails data.frame.
# FIX: previously rm(list = ls()[-5]), which relied on "emails" being the
# 5th name in ls()'s alphabetical order -- fragile if any extra object
# exists in the session. Name the survivor explicitly instead.
rm(list = setdiff(ls(), "emails"))
dim(emails)
## [1] 7560   12
names(emails)
##  [1] "to"        "from"      "senderId"  "released"  "emails"   
##  [6] "text"      "rawtext"   "date"      "day_month" "day_week" 
## [11] "month"     "year"

Create a binary variable from_to to denote whether the email was sent or received by Hillary.

# Classify emails as sent by Hillary (SenderPersonId == 80) or to her
emails$from_to <- as.factor(ifelse(emails$senderId == 80, "From Hillary", "To Hillary"))

table(emails$from_to)
## 
## From Hillary   To Hillary 
##         1978         5582
# Quick look at some emails text
head(emails$text, 4)
## [1] " Wow What a wonderful, strong and moving statement by your boss. please tell her how much Sen. McCain appreciated it. Me too "                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               
## [2] "Thx"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         
## [3] "H <hrod17@clintonemail.com>\nFriday, March 11, 2011 1:36 PM\nHuma Abedin\nFw: H: Latest: How Syria is aiding Qaddafi and more... Sid\nhrc memo syria aiding libya 030311.docx\nPis print."                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   
## [4] "Pis print.\n-•-...-^\nH < hrod17@clintonernailcom>\nWednesday, September 12, 2012 2:11 PM\n°Russorv@state.gov'\nFw: Meet The Right-Wing Extremist Behind Anti-fvluslim Film That Sparked Deadly Riots\nFrom [meat)\nSent: Wednesday, September 12, 2012 01:00 PM\nTo: 11\nSubject: Meet The Right Wing Extremist Behind Anti-Muslim Film That Sparked Deadly Riots\nhtte/maxbiumenthal.com12012/09/meet-the-right-wing-extremist-behind-anti-musiim-tihn-that-sparked-\ndeadly-riots/\nSent from my Verizon Wireless 4G LTE DROID\nU.S. Department of State\nCase No. F-2015-04841\nDoc No. C05739559\nDate: 05/13/2015\nSTATE DEPT. - PRODUCED TO HOUSE SELECT BENGHAZI COMM.\nSUBJECT TO AGREEMENT ON SENSITIVE INFORMATION & REDACTIONS. NO FOIA WAIVER. STATE-5CB0045251"

The text needs some cleaning

text <- emails$text

# Remove email addresses and address-like fragments -----------------------
email_address <- "<(.*)>"
text <- gsub(email_address, " ", text)
text <- gsub("(.*)\\.com", " ", text)
text <- gsub("^[A-Z0-9._%+-]+@[A-Z0-9.-]+\\.[A-Z]{2,}$", "", text)
text <- gsub("(.*)@state\\.gov", "", text)

# Remove line breaks
text <- gsub("\\n", " ", text)

# Remove dates
week_days <- "(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)"
months <- "(January|February|March|April|May|June|July|August|September|October|November|December)"

text <- gsub(week_days, " ", text)
text <- gsub(months, " ", text)

# More tidy up: AM/PM-style time markers, redaction codes, quotes and
# header keywords
text <- gsub("[A-P]M", " ", text)
text <- gsub("B\\d", " ", text)
text <- gsub("\"", " ", text)
# FIX: the original used ranges like [T-t], [F-f], [S-s], [R-r], [E-e].
# In ASCII a range such as [T-t] also matches U..Z, the punctuation
# characters [ \ ] ^ _ ` and the lowercase letters a..t, so unintended
# words (e.g. "go:", "do:", "Xo:") were silently deleted. Use explicit
# two-letter classes instead.
text <- gsub("[Tt]o\\:|[Ff]rom\\:|H\\:|[Ff]or\\:|[Ss]ent\\:|[Rr][Ee]\\:|FW\\:|Fw\\:|Fwd\\:|mailto\\:|Tel\\:", " ", text)
text <- gsub("Subject\\:", "", text)
text <- gsub("\\/(.*)\\/", "", text)
text <- gsub("^http\\:(.*)", "", text)

# head(text, 50)
# tail(text, 50)
# text[1000:1050]
# Correct some words: OCR renders "Pls" as "Pis"
text <- gsub("Pis", "Pls", text)

# More pre-processing:
text <- gsub("'", "", text)  # remove apostrophes
text <- gsub("•", "", text) # remove bullet characters
text <- gsub("[[:punct:]]", "", text)  # remove punctuation 
text <- gsub("[[:cntrl:]]", " ", text)  # replace control characters with space
text <- gsub("^[[:space:]]+", "", text) # remove whitespace at beginning of documents
text <- gsub("[[:space:]]+$", "", text) # remove whitespace at end of documents
text <- gsub("[[:digit:]]", "", text) # remove numbers
text <- tolower(text)  # force to lowercase
# drop leftover single-letter header fragments and inline image markers
text <- gsub("h ", "", text)
text <- gsub("w ", "", text)
text <- gsub("pm ", "", text)
text <- gsub("imagejpg", "", text)

emails$text <- text

# remove emails that became empty after cleaning
empty <- emails$text == "" | emails$text == " "
emails <- emails[!empty, ]

dim(emails)
## [1] 7393   13

N-gram Frequencies

# Drop English stop words before counting n-grams
txt <- emails$text
txt <- removeWords(txt, words = stopwords("english"))

# Split the corpus by direction (to / from Hillary)
txt_TO <- txt[emails$from_to == "To Hillary"]
length(txt_TO)
## [1] 5432
txt_FROM <- txt[emails$from_to == "From Hillary"]
length(txt_FROM)
## [1] 1961
# Frequencies of single terms: combined, to-Hillary, from-Hillary
out.1 <- tau::textcnt(x = txt, 
                      method = "string", 
                      n = 1, 
                      decreasing = TRUE)

out.1_TO <- tau::textcnt(x = txt_TO, 
                      method = "string", 
                      n = 1, 
                      decreasing = TRUE)

out.1_FROM <- tau::textcnt(x = txt_FROM, 
                         method = "string", 
                         n = 1, 
                         decreasing = TRUE)

# Frequencies of bigrams
out.2 <- tau::textcnt(x = txt, 
                    method = "string", 
                    n = 2, 
                    decreasing = TRUE)

out.2_TO <- tau::textcnt(x = txt_TO, 
                      method = "string", 
                      n = 2, 
                      decreasing = TRUE)

out.2_FROM <- tau::textcnt(x = txt_FROM, 
                      method = "string", 
                      n = 2, 
                      decreasing = TRUE)

# Frequencies of trigrams
out.3 <- tau::textcnt(x = txt, 
                    method = "string", 
                    n = 3, 
                    decreasing = TRUE)

out.3_TO <- tau::textcnt(x = txt_TO, 
                      method = "string", 
                      n = 3, 
                      decreasing = TRUE)

out.3_FROM <- tau::textcnt(x = txt_FROM, 
                      method = "string", 
                      n = 3, 
                      decreasing = TRUE)
par(mfrow=c(2,3), mar=c(2,6,2,2))
# Single-term barplots: combined, to-Hillary, from-Hillary
barplot(rev(head(out.1, 20)), col ="orange",
        horiz = TRUE, las=1, main = "Frequency of terms\n(Combined)")

barplot(rev(head(out.1_TO, 20)), col ="orange",
        horiz = TRUE, las=1, main = "Frequency of terms\n(To Hillary)")

barplot(rev(head(out.1_FROM, 20)), col ="orange",
        horiz = TRUE, las=1, main = "Frequency of terms\n(From Hillary)")

# Single-term wordclouds
wordcloud(names(out.1), freq = out.1, scale = c(5, .05), min.freq = 10,
          max.words = 150, random.order = FALSE, colors = brewer.pal(6,"Dark2"))

# FIX: the TO/FROM clouds previously passed freq = out.1 (the combined
# counts) with the subset term names, mismatching words and frequencies;
# use the matching frequency vectors.
wordcloud(names(out.1_TO), freq = out.1_TO, scale = c(5, .05), min.freq = 10,
          max.words = 150, random.order = FALSE, colors = brewer.pal(6,"Dark2"))

wordcloud(names(out.1_FROM), freq = out.1_FROM, scale = c(5, .05), min.freq = 10,
          max.words = 150, random.order = FALSE, colors = brewer.pal(6,"Dark2"))

par(mfrow=c(2,3), mar=c(2,8,2,3))
# Bigram barplots: combined, to-Hillary, from-Hillary
barplot(rev(head(out.2, 20)), col ="orange",
        horiz = TRUE, las=1, main = "Frequency of bi-grams\n(Combined)")

barplot(rev(head(out.2_TO, 20)), col ="orange",
        horiz = TRUE, las=1, main = "Frequency of bi-grams\n(To Hillary)")

barplot(rev(head(out.2_FROM, 20)), col ="orange",
        horiz = TRUE, las=1, main = "Frequency of bi-grams\n(From Hillary)")

# Bigram wordclouds
wordcloud(names(out.2), freq = out.2, scale = c(2, .0005), min.freq = 10,
          max.words = 150, random.order = FALSE, colors = brewer.pal(6,"Dark2"))

# FIX: the TO/FROM clouds previously passed freq = out.2 (the combined
# counts) with the subset term names; use the matching frequency vectors.
wordcloud(names(out.2_TO), freq = out.2_TO, scale = c(2, .0005), min.freq = 10,
          max.words = 150, random.order = FALSE, colors = brewer.pal(6,"Dark2"))

wordcloud(names(out.2_FROM), freq = out.2_FROM, scale = c(2, .0005), min.freq = 10,
          max.words = 150, random.order = FALSE, colors = brewer.pal(6,"Dark2"))

par(mfrow=c(2,3), mar=c(2,12,2,3))
# Trigram barplots: combined, to-Hillary, from-Hillary
barplot(rev(head(out.3, 20)), col ="orange",
        horiz = TRUE, las=1, main = "Frequency of tri-grams\n(Combined)")

barplot(rev(head(out.3_TO, 20)), col ="orange",
        horiz = TRUE, las=1, main = "Frequency of tri-grams\n(To Hillary)")

barplot(rev(head(out.3_FROM, 20)), col ="orange",
        horiz = TRUE, las=1, main = "Frequency of tri-grams\n(From Hillary)")

# Trigram wordclouds
wordcloud(names(out.3), freq = out.3, scale = c(2, .0005), min.freq = 10,
          max.words = 150, random.order = FALSE, colors = brewer.pal(6,"Dark2"))

# FIX: the TO/FROM clouds previously passed freq = out.3 (the combined
# counts) with the subset term names; use the matching frequency vectors.
wordcloud(names(out.3_TO), freq = out.3_TO, scale = c(2, .0005), min.freq = 10,
          max.words = 150, random.order = FALSE, colors = brewer.pal(6,"Dark2"))

wordcloud(names(out.3_FROM), freq = out.3_FROM, scale = c(2, .0005), min.freq = 10,
          max.words = 150, random.order = FALSE, colors = brewer.pal(6,"Dark2"))

Topic Modeling

Latent Dirichlet Allocation (LDA)

# example @ http://cpsievert.github.io/LDAvis/reviews/reviews.html

# tokenize on space and output as a list (one character vector per email):
doc.list <- strsplit(txt, "[[:space:]]+")

# compute the table of terms over the whole corpus:
term.table <- table(unlist(doc.list))
term.table <- sort(term.table, decreasing = TRUE)

# read in some stopwords:
stop_words <- stopwords("SMART")

# remove terms that are stop words or occur fewer than 3 times:
del <- names(term.table) %in% stop_words | term.table < 3 
term.table <- term.table[!del]

# Remove the empty-string term (artifact of splitting on whitespace)
head(names(term.table))
## [1] ""          "state"     "call"      "president" "fyi"       "secretary"
empty <- names(term.table) == ""
term.table <- term.table[!empty]

# final vocabulary, ordered by decreasing frequency
vocab <- names(term.table)

# Convert one tokenized document into the 2 x n integer matrix expected by
# the lda package: row 1 holds 0-based indices into `vocab` (looked up in
# the calling environment), row 2 holds the per-token count (always 1).
# Tokens absent from the vocabulary are dropped.
get.terms <- function(x) {
        idx <- match(x, vocab)   # position of each token in the vocabulary
        idx <- idx[!is.na(idx)]  # discard out-of-vocabulary tokens
        rbind(as.integer(idx - 1L), as.integer(rep(1L, length(idx))))
}

# convert every tokenized document into lda format
documents <- lapply(doc.list, get.terms)

# Compute some statistics related to the data set:
D <- length(documents)  # number of documents (7393)
W <- length(vocab)  # number of terms in the vocab (10296)
# number of tokens per document [7, 1, 14, 15, 118, ...]
doc.length <- sapply(documents, function(x) sum(x[2, ])) 

# total number of tokens in the data (198088)
N <- sum(doc.length)  

# frequencies of terms in the corpus [1114, 984, 837, 756, 666, ...]
term.frequency <- as.integer(term.table)  
# MCMC and model tuning parameters:
K <- 30 # number of topics
G <- 5000 # number of Gibbs sampling iterations
alpha <- 0.02 # document-topic Dirichlet prior
eta <- 0.02 # topic-term Dirichlet prior

# Fit the LDA model by collapsed Gibbs sampling (seeded for reproducibility;
# takes roughly 13 minutes):
set.seed(357)
t1 <- Sys.time()
fit <- lda.collapsed.gibbs.sampler(documents = documents, K = K, vocab = vocab, 
                                   num.iterations = G, alpha = alpha, 
                                   eta = eta, initial = NULL, burnin = 0,
                                   compute.log.likelihood = TRUE)
t2 <- Sys.time()
t2 - t1  # 12.7376 mins
## Time difference of 13.17126 mins
# beepr::beep(0)
# Estimate document-topic distributions (D x K matrix theta) and
# topic-term distributions (K x W matrix phi) by normalising the
# smoothed count matrices from the fit
theta <- t(apply(fit$document_sums + alpha, 2, function(x) x/sum(x)))
phi <- t(apply(t(fit$topics) + eta, 2, function(x) x/sum(x)))

# Bundle everything LDAvis needs into a single list
emails4LDA <- list(
        phi = phi,
        theta = theta,
        doc.length = doc.length,
        vocab = vocab,
        term.frequency = term.frequency
)

# Cache to disk and reload (lets a re-run skip the long model fit)
saveRDS(emails4LDA, file = "emails4LDA.RDS")
emails4LDA <- readRDS("emails4LDA.RDS")

# create the JSON object that drives the LDAvis widget:
json <- createJSON(phi = emails4LDA$phi,
                   theta = emails4LDA$theta,
                   doc.length = emails4LDA$doc.length,
                   vocab = emails4LDA$vocab,
                   term.frequency = emails4LDA$term.frequency)

# write the interactive visualisation to ./LDAvis and open it
serVis(json, out.dir = 'LDAvis', open.browser = TRUE)

Visualization of the LDA analysis created with the LDAvis package. Click on this link or on the figure above to open the analysis in a web browser for interactive visualization.

Structural Topic Model

# Prepare data for the structural topic model (stemming/stopword removal
# happens later in textProcessor). Covariates must be factors.
emails$from_to <- factor(emails$from_to)
emails$released <- factor(emails$released)
emails$day_month <- factor(emails$day_month)
emails$year <- factor(emails$year)

stm_data <- emails[,c("released", "senderId", "to", "from",
                      "from_to", "emails","text", "rawtext", "date", 
                      "day_month", "day_week", "month", "year")]

# month / weekday names, lower-cased, to extend the stopword list later
month_names <- tolower(as.character(unique(emails$month)))
week_days_names <- tolower(as.character(unique(emails$day_week)))

Sentiment as Topic Model Covariate

# Compute the sentiment score on a [-1,1] range with qdap::polarity()
# (takes about 2.5 minutes on this corpus)
txt <- tolower(stm_data$text)
t1 <- Sys.time()
sentiments <- polarity(txt,
                       polarity.frame = qdapDictionaries::key.pol,
                       negators = qdapDictionaries::negation.words,
                       amplifiers = qdapDictionaries::amplification.words,
                       deamplifiers = qdapDictionaries::deamplification.words, 
                       amplifier.weight = 0.8,
                       n.before = 4, 
                       n.after = 2,
                       constrain = TRUE)
t2 <- Sys.time()
t2-t1 # 2.439788 mins
## Time difference of 2.54607 mins
# beepr::beep(0)

# Add a column called sentiment to the stm_data data.frame 
stm_data$sentiment <- sentiments$all$polarity

# General output
sentiments$group
##   all total.sentences total.words ave.polarity sd.polarity
## 1 all            7393      447235   0.03223761   0.1225835
##   stan.mean.polarity
## 1           0.262985
## Structure of the detailed output
# str(sentiments$all)

# Distribution of the sentiment - as expected, a lot of zeros
stm_data[which(is.na(stm_data$sentiment)),] # one email has no text -> leads to NaN sentiment
##             released senderId to                                     from
## 5945 RELEASE IN FULL       87    Sullivan, Jacob J <SullivanJJ@state.gov>
##         from_to       emails text
## 5945 To Hillary 202 647 8947     
##                                                                                                                                                                                                                                                                                                                                          rawtext
## 5945 UNCLASSIFIED U.S. Department of State Case No. F-2014-20439 Doc No. C05771817 Date: 08/31/2015 RELEASE IN FULL From: Sullivan, Jacob J <SullivanJJ@state.gov> Sent: Saturday, August 28, 2010 10:34 AM To: Subject: My fax # 202 647 8947 UNCLASSIFIED U.S. Department of State Case No. F-2014-20439 Doc No. C05771817 Date: 08/31/2015 \f
##            date day_month day_week  month year sentiment
## 5945 2010-08-28        28 Saturday August 2010       NaN
summary(stm_data$sentiment)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
## -0.59100  0.00000  0.00000  0.03224  0.07059  0.83920        1
par(mfrow=c(1,2))
hist(stm_data$sentiment, 
     col = "orange", 
     xlab = "sentiment [-1, 1]", 
     main = "")

boxplot(stm_data$sentiment, 
        range = F, 
        col = "orange",
        ylab = "sentiment [-1, 1]")

# Remove the single case whose sentiment is NA (the row shown above)
idx <- which(is.na(stm_data$sentiment))
stm_data <- stm_data[-idx,]

# Extend the stopword list with artifacts left over from cleaning
stop_words <- c(stop_words, "imagejpg", "Subject:", "AM", "PM", month_names, week_days_names)

processed <- textProcessor(stm_data$text, 
                           metadata=stm_data, 
                           stem = FALSE,
                           striphtml = TRUE,
                           customstopwords = stop_words,
                           verbose=FALSE)
# Choose a frequency threshold by inspecting how much each value removes
plotRemoved(processed$documents, lower.thresh=seq(from = 1, to = 50, by = 1))

# structure and index for usage in the stm model; terms appearing in
# fewer than lower.thresh documents are dropped
out <- prepDocuments(processed$documents, processed$vocab, processed$meta, lower.thresh = 3)
## Removing 23716 of 30834 terms (31999 of 170423 tokens) due to frequency 
## Removing 100 Documents with No Words 
## Your corpus now has 6889 documents, 7118 terms and 138424 tokens.
# output has objects meta, documents, and vocab
docs <- out$documents
vocab <- out$vocab
meta  <-out$meta
##############
##############

# released and fromH as a covariate in the topic prevalence
t1 <- Sys.time()
emailsFit <- stm(out$documents,
                 out$vocab,
                 K=40,
                 prevalence = ~ sentiment + from_to + released + day_week + month + year, 
                 max.em.its = 500,
                 data=out$meta,
                 seed=5926696,
                 verbose=FALSE,
                 init.type="Spectral")
t2 <- Sys.time()
t2-t1 # 16.81741 mins (converge around iteration 160)
## Time difference of 20.19022 mins
# beepr::beep(0)

saveRDS(out, file = "out.RDS")
saveRDS(emailsFit, file = "emailsFit.RDS")
out <- readRDS("out.RDS")
emailsFit <- readRDS("emailsFit")

# stmBrowser
stmBrowser(mod = emailsFit, 
           data = out$meta, 
           covariates = c("sentiment", "from_to", "released", "day_week", "month", "year"),
           text = "emails",
           id = NULL,
           n = 7000, 
           labeltype ="frex") #prob

Visualization of the Structural Topic Model created with the stmBrowser package. Click on this link or on the figure above to open the analysis in a web browser for interactive visualization.

Session information

# Record R and package versions for reproducibility
devtools::session_info()
## Session info --------------------------------------------------------------
##  setting  value                       
##  version  R version 3.2.3 (2015-12-10)
##  system   x86_64, darwin13.4.0        
##  ui       X11                         
##  language (EN)                        
##  collate  en_GB.UTF-8                 
##  tz       Europe/London               
##  date     2016-02-14
## Packages ------------------------------------------------------------------
##  package          * version    date      
##  assertthat         0.1        2013-12-06
##  BBmisc           * 1.9        2015-02-03
##  bitops             1.0-6      2013-08-17
##  checkmate          1.7.1      2016-02-02
##  chron              2.3-47     2015-06-24
##  codetools          0.2-14     2015-07-15
##  colorspace         1.2-6      2015-03-11
##  data.table         1.9.6      2015-09-19
##  DBI                0.3.1      2014-09-24
##  devtools           1.10.0     2016-01-23
##  digest             0.6.9      2016-01-08
##  dplyr              0.4.3      2015-09-01
##  evaluate           0.8        2015-09-18
##  foreach            1.4.3      2015-10-13
##  formatR            1.2.1      2015-09-18
##  gdata              2.17.0     2015-07-04
##  gender             0.5.1      2015-09-04
##  ggplot2            2.0.0      2015-12-18
##  glmnet             2.0-2      2015-04-12
##  gridExtra          2.0.0      2015-07-14
##  gtable             0.1.2      2012-12-05
##  gtools             3.5.0      2015-05-29
##  htmltools          0.3        2015-12-29
##  httr               1.1.0      2016-01-28
##  igraph             1.0.1      2015-06-26
##  iterators          1.0.8      2015-10-13
##  knitr              1.12.3     2016-01-22
##  lattice            0.20-33    2015-07-14
##  lda              * 1.4.2      2015-11-22
##  LDAvis           * 0.3.2      2015-11-12
##  lubridate        * 1.5.0.9000 2016-02-14
##  magrittr           1.5        2014-11-22
##  Matrix             1.2-3      2015-11-28
##  matrixStats        0.50.1     2015-12-15
##  memoise            1.0.0      2016-01-29
##  munsell            0.4.2      2013-07-11
##  NLP              * 0.1-8      2015-07-03
##  openNLP            0.2-5      2015-05-06
##  openNLPdata        1.5.3-2    2015-06-24
##  plotrix            3.6-1      2015-12-20
##  plyr               1.8.3      2015-06-12
##  png              * 0.1-7      2013-12-03
##  qdap             * 2.2.4      2015-10-09
##  qdapDictionaries * 1.0.6      2015-05-21
##  qdapRegex        * 0.6.0      2015-12-13
##  qdapTools        * 1.3.1      2015-09-25
##  R6                 2.1.2      2016-01-26
##  RColorBrewer     * 1.1-2      2014-12-07
##  Rcpp               0.12.3     2016-01-10
##  RCurl              1.95-4.7   2015-06-30
##  reports            0.1.4      2014-12-21
##  reshape2           1.4.1      2014-12-06
##  rJava              0.9-8      2016-01-07
##  rmarkdown          0.9.2      2016-01-01
##  scales             0.3.0      2015-08-25
##  slam               0.1-32     2014-04-02
##  stm              * 1.1.3      2016-01-15
##  stmBrowser       * 1.0        2015-07-16
##  stringi          * 1.0-1      2015-10-22
##  stringr          * 1.0.0      2015-04-30
##  tau                0.0-18     2015-02-10
##  tm               * 0.6-2      2015-07-03
##  venneuler          1.1-0      2011-08-10
##  wordcloud        * 2.5        2014-06-13
##  xlsx               0.5.7      2014-08-02
##  xlsxjars           0.6.1      2014-08-22
##  XML                3.98-1.3   2015-06-30
##  yaml               2.1.13     2014-06-12
##  source                           
##  CRAN (R 3.2.0)                   
##  CRAN (R 3.2.0)                   
##  CRAN (R 3.2.0)                   
##  CRAN (R 3.2.3)                   
##  CRAN (R 3.2.0)                   
##  CRAN (R 3.2.3)                   
##  CRAN (R 3.2.0)                   
##  CRAN (R 3.2.0)                   
##  CRAN (R 3.2.0)                   
##  CRAN (R 3.2.3)                   
##  CRAN (R 3.2.3)                   
##  CRAN (R 3.2.0)                   
##  CRAN (R 3.2.0)                   
##  CRAN (R 3.2.0)                   
##  CRAN (R 3.2.0)                   
##  CRAN (R 3.2.0)                   
##  CRAN (R 3.2.0)                   
##  CRAN (R 3.2.3)                   
##  CRAN (R 3.2.0)                   
##  CRAN (R 3.2.0)                   
##  CRAN (R 3.2.0)                   
##  CRAN (R 3.2.0)                   
##  CRAN (R 3.2.3)                   
##  CRAN (R 3.2.3)                   
##  CRAN (R 3.2.0)                   
##  CRAN (R 3.2.0)                   
##  CRAN (R 3.2.3)                   
##  CRAN (R 3.2.3)                   
##  CRAN (R 3.2.2)                   
##  Github (cpsievert/LDAvis@d749553)
##  Github (hadley/lubridate@ecfb0dc)
##  CRAN (R 3.2.0)                   
##  CRAN (R 3.2.3)                   
##  CRAN (R 3.2.3)                   
##  CRAN (R 3.2.3)                   
##  CRAN (R 3.2.0)                   
##  CRAN (R 3.2.0)                   
##  CRAN (R 3.2.0)                   
##  CRAN (R 3.2.0)                   
##  CRAN (R 3.2.3)                   
##  CRAN (R 3.2.0)                   
##  CRAN (R 3.2.0)                   
##  CRAN (R 3.2.0)                   
##  CRAN (R 3.2.0)                   
##  CRAN (R 3.2.3)                   
##  CRAN (R 3.2.0)                   
##  CRAN (R 3.2.3)                   
##  CRAN (R 3.2.0)                   
##  CRAN (R 3.2.3)                   
##  CRAN (R 3.2.0)                   
##  CRAN (R 3.2.0)                   
##  CRAN (R 3.2.0)                   
##  CRAN (R 3.2.3)                   
##  CRAN (R 3.2.3)                   
##  CRAN (R 3.2.0)                   
##  CRAN (R 3.2.0)                   
##  CRAN (R 3.2.3)                   
##  CRAN (R 3.2.0)                   
##  CRAN (R 3.2.0)                   
##  CRAN (R 3.2.0)                   
##  CRAN (R 3.2.0)                   
##  CRAN (R 3.2.0)                   
##  CRAN (R 3.2.0)                   
##  CRAN (R 3.2.0)                   
##  CRAN (R 3.2.0)                   
##  CRAN (R 3.2.0)                   
##  CRAN (R 3.2.0)                   
##  CRAN (R 3.2.0)

This document was processed on: 2016-02-14.